Notes to this new file: (March 3, 2024)
This file builds on the file “Xu_2023_v4.0_AllfirmsPorts.Rmd” and the replication file “Hongyi R conversion 2.html” (from Dean).
The purpose of this replication/updated file is to find the reconcile results from the above two files.
Currently, the treatment of negative earnings is still using a small, non-negative number for replacement.
Now the whole universe is the CRSP: NYSE + AMEX + NASDAQ.
timestamp()
## ##------ Thu Mar 14 08:07:34 2024 ------##
# 0. record datasets ----
## 0.1 initial value setup ----
freq = 12 # the frequency of the data <- 12 for monthly; 4 for quarterly; 1 for annually
start.ym = as.yearmon(1966) # -1/12 # the starting time
end.ym = as.yearmon(2020) # the ending time
month_select <- function(freq = freq) { # return the months for differnt time frequency
if (freq == 12) {
return(c("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November","December"))
}
if (freq == 4) {
return(c("March", "June", "September", "December"))
}
if (freq == 1) {
return("December")
}
}
freq_name <- function(freq = freq) {
if (freq == 12) {
return("monthly")
}
if (freq == 4) {
return("quarterly")
}
if (freq == 1) {
return("annual")
}
}
## 0.2 preliminary data cleaning ----
# "Variables_DK_CRSP_monthly_196601_201912.csv"
# "PredictorDataDK_CRSPm.csv"
# "PredictorDataDK_monthly_all.csv" - DK replication
port_market <- read.csv("/Users/hongyixu/Library/CloudStorage/OneDrive-HandelshögskolaniStockholm/Xu_Katselas_Drienko._2021/R&R_JEF_PredictorDataDk_Dec3_2023/Variables_DK_CRSP_monthly_196601_201912.csv") %>%
as_tibble() %>%
mutate(E12 = ((E12*(E12 > 0) + 0.001*(E12 <= 0))) * 12/freq) %>%
filter(E12 > 0) %>%
mutate(month = as.yearmon(as.character(yyyymm), format = "%Y%m"),
vwret = rollsumr(log(1+vwret), 12/freq, NA) ) %>% # converted into the log returns.
filter(months(month) %in% month_select(freq = freq)) %>%
filter(month < end.ym)
plot(y = port_market$E12, x = port_market$month, typew = "b"); abline(h = 0, col = 'red', lty = 2)
#elif negative == 'truncate':
# df['E12'] = np.where(df['E12'] < 0, 0.001, df['E12'])
write.csv(x = port_market, file = "market_Allfirms.csv")
# port_market_raw <- port_market %>% select(month, vwret, R, date, GM, GE, DP)
ports_all <- read.csv("/Users/hongyixu/Library/CloudStorage/OneDrive-HandelshögskolaniStockholm/Xu_Katselas_Drienko._2021/R&R_JEF_PredictorDataDk_Dec3_2023/PredictorDataDK_szbm_CRSPm.csv") %>%
# PredictorDataDK_port_CRSPm.csv
# PredictorDataDK_szbm_CRSPm.csv
as_tibble() %>%
mutate(E12 = ((E12*(E12 > 0) + 0.001*(E12 <= 0))) * 12/freq) %>%
filter(E12 > 0) %>%
mutate(month = as.yearmon(as.Date(date, format = "%Y-%m-%d")),
port = as.character(port),
vwret = rollsumr(log(1+vwret), 12/freq, NA) ) %>%
filter(months(month) %in% month_select(freq = freq)) %>%
filter(month < end.ym)
ports <- unique(ports_all$port) # identifiers for portfolios
for (p in ports) {
ports.dt <- ports_all %>%
filter(port == p) %>%
arrange(month)
ports.dt <- ports.dt[, -1] # remove the column `port`
write.csv(x = ports.dt, file = paste("port_", p, ".csv", sep = ""))
}
## 0.3 name portfolios and predictors ----
market.names <- list.files(pattern = "market_")[1]
data.names <- list.files(pattern = "^port_") # data for portfolios
## generate the name of portfolios
## Define the two sets
#set1 <- c("B", "S")
#set2 <- c("H", "M", "L")
# Use expand.grid() to generate all combinations
#combinations <- expand.grid(set2, set1)[, c(2, 1)]
## Convert the result to a vector
# ports <- do.call(paste0, combinations) ## ----updated March 7, 2024
id.names <- c("Market", ports) # set plot names
ratio_names <- c("DP", "PE", "EY", "DY", "Payout") # potential predictors
## 0.4 risk-free rate -----
RF <- read.csv("/Users/hongyixu/Library/CloudStorage/OneDrive-HandelshögskolaniStockholm/Xu_Katselas_Drienko._2021/R&R_JEF_PredictorDataDk_Dec3_2023/Rfree_t30.csv") %>% # record the risk free rate
as.tbl() %>% # as the average of the bid and ask.
select(-X) %>%
mutate(month = as.yearmon(month)) %>%
filter(months(month) %in% month_select(freq = freq)) %>%
filter(month < end.ym)
Log cumulative realised portfolio return components for seven portfolios - the market portfolio and six size and book-to-market equity ratio sorted portfolios. All following figures demonstrate the monthly realised price-earnings ratio growth (gm), earnings growth (ge), dividend-price (dp) and the portfolio return index (r) with the values in January 1966 as zero for all portfolios.
# TABLE-1. summary statistics ----
TABLE1.uni <- list() # the univariate statistics
TABLE1.cor <- list() # the correlation matrixs
PE.df <- data.frame(month = port_market$month[port_market$month >= start.ym])
EY.df <- data.frame(month = port_market$month[port_market$month >= start.ym])
DP.df <- data.frame(month = port_market$month[port_market$month >= start.ym])
## (1*) summary tables for Summary & Correlations ----
for (c in 1:7) {
id <- c(market.names, data.names)[c]
# print(id); print(id.names[c])
## 1. read the data ----
data_nyse <- read.csv(id) %>%
as_tibble() %>%
mutate(month = as.yearmon(month)) %>%
filter(month >= start.ym) %>% # start from "Dec 1965"
select(month, r = vwret, P = Index, E = E12, D = D12, pe_raw = PE) %>%
# , R, GM_raw = GM, GE_raw = GE, DP_raw = DP
mutate(DP = D / P, # these are adjusted by the log transformation
PE = P / E,
EP = E / P,
EY = E / lag(P), # earnings yield
DY = D / lag(P), # dividend yield
Payout = D / E) # payout ratios
PE.df <- cbind.data.frame(PE.df, data_nyse$PE)
EY.df <- cbind.data.frame(EY.df, data_nyse$EY)
DP.df <- cbind.data.frame(DP.df, data_nyse$DP)
## 2. return decomposition ----
data_decompose <- data_nyse %>%
mutate(r = r, # cts returns = log total returns
gm = log(PE) - lag(log(PE)), # multiple expansion rate
ge = log(E) - lag(log(E)), # earnings growth rate
dp = log(1 + DP/freq)) %>% # only 1/12 of the dividends -----updated March 7, 2024
na.omit()
## 3. summary-Stat ----
ar1.coef <- function(x) {
return(as.numeric(lm(x ~ lag(x))$coefficients[2]))
} # return the function value of the coefficient for the AR(1) model
comp_summary.dt <- data_decompose %>%
select(gm, ge, dp, r) %>%
# , R, GM_raw, GE_raw, DP_raw
describe() %>%
mutate(mean = mean * 100,
sd = sd * 100,
median = median * 100,
min = min * 100,
max = max * 100) %>%
select(Mean = mean, Median = median, SD = sd, Min = min, Max = max, Skew = skew, Kurt = kurtosis) %>%
round(digits = 4)
comp_summary.dt$"AR(1)" <- data_decompose %>%
select(gm, ge, dp, r) %>%
apply(2, ar1.coef) %>%
round(digits = 4)
### Store the summary stat
# print(paste("Data starts from ", first(data_decompose$month), " and ends in ", last(data_decompose$month), ".", sep = ""))
TABLE1.uni[[id.names[c]]] <- comp_summary.dt
## 4. correlations ----
comp_cor <- data_decompose %>% select(gm, ge, dp, r) %>% cor()
TABLE1.cor[[id.names[c]]] <- comp_cor
# Figure-1. cumulative realised return components ----
# jpeg(filename = paste("Figure1_", id.names[c], ".jpeg", sep = ""), width = 550, height = 350)
par(mar = c(2, 4, 2, 1))
cum_components.ts <- data_decompose %>%
select(r, gm, ge, dp) %>%
apply(2, cumsum) %>%
ts(start = data_decompose$month[1], frequency = freq)
plot.ts(cum_components.ts, plot.type = "single", lty = 1:4, main = id.names[c], cex.main = 1,
xlab = NULL, ylab = "Cumulative Return and Components Indices")
legend("topleft",
legend = c("Total return", "Price earnings growth",
"Earnings growth", "Dividend price"),
lty = 1:4,
cex = 1.0) # text size
# dev.off()
par(mar = c(5, 4, 4, 2) + 0.1)
}
write.csv(TABLE1.uni, file = "table_1.uni.csv")
write.csv(TABLE1.cor, file = "table_1.cor.csv")
.
The correlations between gm and ge might be a bit too high comparing to Ferreira and Santa-Clara (2011). Need to check the code again.
## summary data for table1
rowname <- rep(colnames(TABLE1.cor$Market), length(names(TABLE1.uni)))
portname = rep(names(TABLE1.uni), each = length(colnames(TABLE1.cor$Market)))
table1 <- do.call(rbind.data.frame, TABLE1.uni) %>%
cbind.data.frame(do.call(rbind.data.frame, TABLE1.cor)) %>%
round(digits = 2) %>%
cbind.data.frame(rowname, portname)
## give the table 1 outputs
gt(data = table1, rowname_col = "rowname", groupname_col = "portname") %>%
tab_header(title = "Table 1 - Summary statistics of returns components",
subtitle = paste(freq_name(freq = freq), " data starts from ", first(data_decompose$month), " and ends in ", last(data_decompose$month), ".", sep = "")) %>%
tab_spanner(label = "Panel A: univariate statistics",
columns = vars(Mean, Median, SD, Min, Max, Skew, Kurt, "AR(1)")) %>%
tab_spanner(label = "Panel B: Correlations",
columns = vars(gm, ge, dp, r)) %>%
tab_source_note(source_note = html("Note: Panel A in this table presents mean, median, standard deviation (SD), minimum, maximum, skewness (Skew), kurtosis (kurt) and first-order autocorrelation coefficient of the realised components of stock market returns and six size and book-to-market equity ratio sorted portfolios. These univariate statistics for each portfolios are presented separately. gm is the continuously compounded growth rate in the price-earnings ratio. ge is the continuously compounded growth rate in earnings. dp is the log of one plus the dividend-price ratio. *r* is the portfolio returns. Panel B in this table reports correlation matrices for all seven portfolios. The sample period starts from Feburary 1966 and ends in December 2019."))
| Table 1 - Summary statistics of returns components | ||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| monthly data starts from Feb 1966 and ends in Dec 2019. | ||||||||||||
| Panel A: univariate statistics | Panel B: Correlations | |||||||||||
| Mean | Median | SD | Min | Max | Skew | Kurt | AR(1) | gm | ge | dp | r | |
| Market | ||||||||||||
| gm | -0.04 | 0.20 | 51.14 | -988.86 | 807.66 | -5.06 | 309.28 | 0.12 | 1.00 | -1.00 | 0.02 | 0.10 |
| ge | 0.76 | 0.90 | 50.89 | -802.36 | 991.68 | 5.31 | 315.21 | 0.12 | -1.00 | 1.00 | -0.02 | -0.01 |
| dp | 0.23 | 0.22 | 0.08 | 0.08 | 0.46 | 0.35 | -0.44 | 0.99 | 0.02 | -0.02 | 1.00 | -0.04 |
| r | 0.81 | 1.21 | 4.45 | -25.57 | 15.35 | -0.76 | 2.66 | 0.07 | 0.10 | -0.01 | -0.04 | 1.00 |
| BH | ||||||||||||
| gm | 0.10 | 0.24 | 188.73 | -1175.52 | 1168.88 | 0.00 | 26.08 | -0.36 | 1.00 | -1.00 | 0.01 | 0.05 |
| ge | 0.59 | 0.15 | 188.59 | -1168.39 | 1168.39 | 0.00 | 26.09 | -0.35 | -1.00 | 1.00 | -0.01 | -0.03 |
| dp | 0.39 | 0.35 | 0.18 | 0.12 | 0.87 | 0.55 | -0.58 | 0.98 | 0.01 | -0.01 | 1.00 | 0.01 |
| r | 0.89 | 1.32 | 4.63 | -24.01 | 19.29 | -0.62 | 2.64 | 0.06 | 0.05 | -0.03 | 0.01 | 1.00 |
| BL | ||||||||||||
| gm | -0.01 | 0.15 | 11.05 | -65.37 | 62.88 | 0.16 | 8.93 | -0.35 | 1.00 | -0.91 | -0.05 | 0.41 |
| ge | 0.72 | 0.91 | 10.06 | -60.98 | 63.95 | -0.07 | 13.74 | -0.45 | -0.91 | 1.00 | 0.01 | 0.01 |
| dp | 0.19 | 0.17 | 0.06 | 0.06 | 0.39 | 0.66 | -0.06 | 0.96 | -0.05 | 0.01 | 1.00 | -0.06 |
| r | 0.77 | 1.10 | 4.66 | -26.53 | 19.64 | -0.59 | 2.40 | 0.06 | 0.41 | 0.01 | -0.06 | 1.00 |
| BM | ||||||||||||
| gm | 0.03 | 0.19 | 67.00 | -1117.36 | 1107.45 | -0.21 | 232.54 | -0.05 | 1.00 | -1.00 | 0.02 | -0.03 |
| ge | 0.64 | 0.90 | 67.27 | -1117.29 | 1125.20 | 0.15 | 236.19 | -0.04 | -1.00 | 1.00 | -0.02 | 0.09 |
| dp | 0.32 | 0.29 | 0.13 | 0.13 | 0.80 | 0.90 | 0.44 | 0.99 | 0.02 | -0.02 | 1.00 | -0.06 |
| r | 0.81 | 1.15 | 4.31 | -22.04 | 16.35 | -0.63 | 2.71 | 0.04 | -0.03 | 0.09 | -0.06 | 1.00 |
| SH | ||||||||||||
| gm | 2.56 | 1.33 | 225.37 | -1495.12 | 1441.72 | 0.06 | 25.42 | -0.22 | 1.00 | -1.00 | 0.01 | 0.06 |
| ge | -1.36 | 0.00 | 225.14 | -1443.75 | 1486.75 | -0.08 | 25.48 | -0.22 | -1.00 | 1.00 | -0.02 | -0.04 |
| dp | 0.48 | 0.43 | 0.24 | 0.17 | 1.97 | 3.14 | 13.52 | 0.99 | 0.01 | -0.02 | 1.00 | -0.11 |
| r | 1.16 | 1.57 | 5.58 | -33.05 | 25.89 | -0.84 | 4.02 | 0.16 | 0.06 | -0.04 | -0.11 | 1.00 |
| SL | ||||||||||||
| gm | 2.25 | -0.54 | 209.91 | -1163.99 | 1282.51 | 0.17 | 20.88 | -0.31 | 1.00 | -1.00 | 0.03 | -0.05 |
| ge | -1.22 | 0.45 | 210.28 | -1275.66 | 1170.72 | -0.17 | 20.89 | -0.31 | -1.00 | 1.00 | -0.03 | 0.08 |
| dp | 0.18 | 0.15 | 0.13 | 0.03 | 0.92 | 2.62 | 8.99 | 0.97 | 0.03 | -0.03 | 1.00 | -0.04 |
| r | 0.65 | 1.24 | 6.86 | -39.25 | 25.46 | -0.76 | 2.91 | 0.14 | -0.05 | 0.08 | -0.04 | 1.00 |
| SM | ||||||||||||
| gm | 0.02 | -0.22 | 201.78 | -1464.86 | 1463.12 | -0.07 | 28.32 | -0.35 | 1.00 | -1.00 | 0.02 | -0.05 |
| ge | 1.10 | 0.86 | 202.11 | -1467.68 | 1470.85 | 0.07 | 28.43 | -0.34 | -1.00 | 1.00 | -0.02 | 0.08 |
| dp | 0.30 | 0.28 | 0.12 | 0.09 | 0.69 | 0.61 | 0.04 | 0.96 | 0.02 | -0.02 | 1.00 | -0.08 |
| r | 1.06 | 1.50 | 5.49 | -32.57 | 23.59 | -0.84 | 3.54 | 0.14 | -0.05 | 0.08 | -0.08 | 1.00 |
| Note: Panel A in this table presents mean, median, standard deviation (SD), minimum, maximum, skewness (Skew), kurtosis (kurt) and first-order autocorrelation coefficient of the realised components of stock market returns and six size and book-to-market equity ratio sorted portfolios. These univariate statistics for each portfolios are presented separately. gm is the continuously compounded growth rate in the price-earnings ratio. ge is the continuously compounded growth rate in earnings. dp is the log of one plus the dividend-price ratio. *r* is the portfolio returns. Panel B in this table reports correlation matrices for all seven portfolios. The sample period starts from Feburary 1966 and ends in December 2019. | ||||||||||||
# TABLE-2. different tables OOS R-square ----
TABLE2 <- list()
table2.df <- data.frame()
## (3*) repeat for each portfolio ----
c <- 0
for (id in c(market.names, data.names)) {
c <- c + 1
print(id); print(id.names[c])
## 1. read the data ----
data_nyse <- read.csv(id) %>%
as.tbl() %>%
mutate(month = as.yearmon(month)) %>%
filter(month >= start.ym) %>% # start from "Dec 1965"
select(month, r = vwret, P = Index, E = E12, D = D12) %>%
mutate(DP = D / P, # construct predictors
PE = P / E,
EP = E / P,
EY = E / lag(P), # earnings yield
DY = D / lag(P), # dividend yield
Payout = D / E) # payout ratios
## 2. return decomposition ----
data_decompose <- data_nyse %>% # also try PD ratio replacing PE.
mutate(r = r, # cts returns (has already being the log return in row 95)
gm = log(PE) - lag(log(PE)), # multiple expansion rate
ge = log(E) - lag(log(E)), # earnings growth rate
dp = log(1 + DP/freq)) %>% # see note 1.
# only 1/12 of the annualised dividend is included. > see note 1.
na.omit()
## 3. SOP predictions ----
k = freq * 20 # set a 20-year rolling window
data_pred <- data_decompose %>%
select(month, r, gm, ge, dp) %>%
mutate(mu_gm = 0,
mu_ge1 = rollmeanr(ge, k, fill = NA_real_), # rolling mean
mu_ge2 = c(rep(NA_real_, (k-1)), cummean(ge)[k:length(ge)]), # recursive mean
mu_ge3 = cummean(ge), # recursive mean from the beginning
a_DK1 = rollmeanr(r - dp, k, fill = NA_real_), # methods Eq (14/15) by DK
a_DK2 = cummean(r - dp), # methods Eq (14/15) by DK
mu_dp = dp,
mu_sop = mu_gm + mu_ge1 + mu_dp) %>% # the predictor > see note 2.
mutate(sop_simple = lag(mu_sop), # conditional predictions
hist_mean = (cummean(r)) ) # historical mean predictions
## 4. OOS R2 and MSE-F ----
### build the function for the bootstrap
Boot_MSE.F <- function(data, actual, cond, uncond, x, critical.values = TRUE, boot.times = 10000) {
## clean and reorganise the data
ports.dt <- data %>%
select(month, actual = actual, cond = cond, uncond = uncond, x = x)
ports.dt_narm <- ports.dt %>%
na.omit() %>%
mutate(error_A = actual - cond,
error_N = actual - uncond,
Cuml_MSE_A = cummean(error_A ^ 2),
Cuml_MSE_N = cummean(error_N ^ 2),
Cuml_OOS.rsquared = 1 - Cuml_MSE_A / Cuml_MSE_N,
Cum_SSE = cumsum(error_N ^2 - error_A ^ 2) )
### for the full (out-of-)sample
MSE_A <- mean(ports.dt_narm$error_A ^ 2)
MSE_N <- mean(ports.dt_narm$error_N ^ 2)
OOS_r.squared <- 1 - MSE_A / MSE_N # out-of-sample R square
MSE_F <- length(ports.dt_narm$month) * (MSE_N - MSE_A) / (MSE_A)
MAE_A <- mean(abs(ports.dt_narm$error_A)) %>% round(digits = 6) # Mean absolute error of the conditional model
print(paste("OOS R Squared: ", round(OOS_r.squared, digits = 4), sep = ""))
print(paste("MSE-F: ", round(MSE_F, digits = 4), sep = ""))
Cuml_OOS.ts <- ts(ports.dt_narm$Cuml_OOS.rsquared, start = ports.dt_narm$month[1], frequency = freq)
Cum_SSE.ts <- ts(ports.dt_narm$Cum_SSE, start = ports.dt_narm$month[1], frequency = freq)
## Bootstrapped MSE-F Stat
if (critical.values == TRUE) {
## get the data series for x and y
y0 <- ports.dt[is.na(ports.dt$cond), ]$actual
y1 <- ports.dt[!is.na(ports.dt$cond), ]$actual
x1 <- as.vector(na.omit(ports.dt$x))
## full sample estimation for the model
alpha <- mean(y1)
u1 <- y1 - alpha
x.lm <- lm(x1 ~ lag(x1))
mu <- as.numeric(coef(x.lm)[1])
rho <- as.numeric(coef(x.lm)[2])
u2 <- as.numeric(x.lm$residuals)
u <- data.frame(index = 1:length(u1), u1, u2) # the dataset storing all the residual info
### bootstrapping pairs of error terms
boot.critical <- c()
for (i in 1:boot.times) { # the bootstrapped times can be modified
index.boot <- sample(u$index, length(u1), replace = T)
u.boot <- data.frame()
for (j in index.boot) {
u.boot <- rbind.data.frame(u.boot, u[j,])
} # record the bootstrapped error terms
### reconstruct the simulated x and y
y1.new <- alpha + u.boot$u1
x0.new <- sample(x1, 1) # resample a value as the starting value of x
x1.new <- c(mu + rho * x0.new + u.boot$u2[1])
for (j in 2:length(y1.new)) {
x1.new[j] <- mu + x1.new[j-1] * rho + u.boot$u2[j]
} # simulate SOP values
### redo the rolling estimation
y.boot <- c(y0, y1.new)
x.boot <- c(rep(NA_real_, (length(y0) - 1)), x0.new, x1.new)
data.dt <- data.frame(month = ports.dt$month, x.boot, y.boot) %>%
as.tbl() %>%
mutate(conditional = lag(x.boot), # convert to the SOP prediction
unconditional = lag(cummean(y.boot))) %>% # convert to the historical mean prediction
na.omit()
error_N.boot = data.dt$y.boot - data.dt$unconditional
error_A.boot = data.dt$y.boot - data.dt$conditional
MSE_N.boot = mean(error_N.boot ^ 2)
MSE_A.boot = mean(error_A.boot ^ 2)
OOS_r.squared.boot <- 1 - MSE_A.boot / MSE_N.boot %>% round(digits = 6) # out-of-sample R square
### MSE-F statistic
MSE_F.boot <- length(error_N.boot) * (MSE_N.boot - MSE_A.boot) / (MSE_A.boot) %>% round(digits = 6)
boot.critical[i] <- MSE_F.boot
if (i %% (boot.times/10) == 0) {
timestamp()
print(paste("SOP", ": ", i %/% (boot.times/100), "%", sep = ""))
}
}
## store the results
result <- cbind.data.frame(IS_r.squared = NA_real_,
OOS_r.squared,
MAE_A, # MAE of conditional model
MSE_F,
t(quantile(boot.critical, probs = c(0.90, 0.95, 0.99))),
p.value = mean(boot.critical > MSE_F))
# d_RMSE = round(d_RMSE, digits = 4),
# DM_stat = round(DM_test.result$statistic, digits = 4),
# DM_pval = round(DM_test.result$p.value, digits = 4))
} else {
result <- cbind.data.frame(IS_r.squared = NA_real_,
OOS_r.squared,
MAE_A, # MAE of conditional model
MSE_F)
}
rownames(result) <- "SOP"
output <- list(result = result, Cuml_OOS.ts = Cuml_OOS.ts, Cum_SSE.ts = Cum_SSE.ts)
return(output)
}
### store the results for the SOP method
sop.result <- Boot_MSE.F(data = data_pred, actual = "r", cond = "sop_simple", uncond = "hist_mean", x = "mu_sop", critical.values = FALSE, boot.times = 3000)
sop.result$result
par(mfrow = c(2,1))
par(mar = c(3,5,3,2))
plot.ts(sop.result$Cuml_OOS.ts * 100, xlab = NULL, ylab = "as %", main = paste(id.names[c], ": Cumulative OOS R Squared Difference - SOP", sep = ""))
abline(h = c(0,1), lty = 2, col = 2)
par(mar = c(5,5,3,2))
plot.ts(sop.result$Cum_SSE.ts, ylab = NULL, cex.lab = 0.5,
xlab = "An increase in a line indicates better performance of the conditional model",
main = paste(id.names[c], ": Cumulative SSE Difference - SOP", sep = ""))
abline(h = 0, lty = 2, col = 2)
par(mfrow = c(1,1))
### store all cumulative OOS R2 difference values
Cuml_all.ts <- sop.result$Cuml_OOS.ts * 100
Csse_all.ts <- sop.result$Cum_SSE.ts
## 5. univariate predictive regressions ----
table2.uni_predictors <- data.frame()
for (predictor in ratio_names) {
## construct conditional & unconditional predictions
data_univariate <- data_decompose %>%
select(month, r, predictor) %>%
mutate(hist_mean = lag(cummean(r)),
x = lag(get(predictor)) %>% log) ## convert to log predictors
# data_univariate[[predictor]] <- log(data_univariate[[predictor]])
# data_univariate[["x"]] <- log(data_univariate[["x"]])
## IS R2
lm.IS <- lm(r ~ x, data = data_univariate)
IS_r.squared <- summary(lm.IS)$r.squared # in-sample r squared
IS_pval <- summary(lm.IS)$coefficients[2,4] # the p-value from F-statistic
## OOS recursive window predictions
k <- k # the starting in-sample data
con_pred = rep(NA_real_, nrow(data_univariate))
for (t in (k+2):nrow(data_univariate)) {
x.IS <- data_univariate$x[2:(t-1)]
y.IS <- data_univariate$r[2:(t-1)]
reg.IS <- lm(y.IS ~ x.IS)
x.new <- data_univariate$x[t]
y.pred <- predict(reg.IS, newdata = data.frame(x.IS = x.new))
con_pred[t] <- y.pred # store the prediction
}
data_univariate$con_pred <- con_pred
data_univariate
## Stat and Bootstrap
data = data_univariate
actual = "r"
cond = "con_pred"
uncond = "hist_mean"
critical.values = FALSE # decide whether the bootstrapped critical value is calculated > Note 3.
boot.times = 1000 * 10
{ ## get OOS R2 & MSE-F
ports.dt <- data %>%
select(month, actual = actual, cond = cond, uncond = uncond, predictor) %>%
rename(x = predictor)
ports.dt_narm <- ports.dt %>%
na.omit() %>%
mutate(error_A = actual - cond,
error_N = actual - uncond,
Cuml_MSE_A = cummean(error_A ^ 2),
Cuml_MSE_N = cummean(error_N ^ 2),
Cuml_OOS.rsquared = 1 - Cuml_MSE_A / Cuml_MSE_N,
Cum_SSE = cumsum(error_N ^2 - error_A ^ 2))
MSE_A <- mean(ports.dt_narm$error_A ^ 2)
MSE_N <- mean(ports.dt_narm$error_N ^ 2)
OOS_r.squared <- 1 - MSE_A / MSE_N # out-of-sample R square
MSE_F <- length(ports.dt_narm$month) * (MSE_N - MSE_A) / (MSE_A)
MAE_A <- mean(abs(ports.dt_narm$error_A)) # Mean absolute error of the conditional model
print(paste("IS R Squared: ", round(IS_r.squared, digits = 4), sep = ""))
print(paste("OOS R Squared: ", round(OOS_r.squared, digits = 4), sep = ""))
print(paste("MSE-F: ", round(MSE_F, digits = 4), sep = ""))
### combine the cumulative OOS R2 difference of other predictors & cum SSE
Cuml_pred.ts <- ts(ports.dt_narm$Cuml_OOS.rsquared * 100, start = ports.dt_narm$month[1], frequency = freq)
Cuml_all.ts <- ts.union(Cuml_all.ts, Cuml_pred.ts)
Cum_pred.sse <- ts(ports.dt_narm$Cum_SSE, start = ports.dt_narm$month[1], frequency = freq)
Csse_all.ts <- ts.union(Csse_all.ts, Cum_pred.sse)
## prepare the bootstrap
if (critical.values == TRUE) {
y0 <- ports.dt$actual[1]
y1 <- ports.dt$actual[-1]
x1 <- ports.dt$x
## full sample estimation for the model
alpha <- mean(y1)
u1 <- y1 - alpha
x.lm <- lm(x1 ~ lag(x1))
mu <- as.numeric(coef(x.lm)[1])
rho <- as.numeric(coef(x.lm)[2])
u2 <- as.numeric(x.lm$residuals)
u <- data.frame(index = 1:length(u1), u1, u2) # the dataset storing all the residual info
### bootstrap pairs of error terms
boot.critical <- c()
for (i in 1:boot.times) { # the bootstrapped times can be modified
index.boot <- sample(u$index, length(u1), replace = T)
u.boot <- data.frame()
for (j in index.boot) {
u.boot <- rbind.data.frame(u.boot, u[j,])
} # record the bootstrapped error terms
### reconstruct the simulated x and y
y1.new <- alpha + u.boot$u1
x0.new <- sample(x1, 1) # resample a value as the starting value of x
x1.new <- c(mu + rho * x0.new + u.boot$u2[1])
for (j in 2:length(y1.new)) {
x1.new[j] <- mu + x1.new[j-1] * rho + u.boot$u2[j]
} # simulate predictors
### redo the rolling estimation
y.boot <- c(y0, y1.new)
x.boot <- c(x0.new, x1.new)
data.dt <- as.tbl(data.frame(month = ports.dt$month, x.boot, y.boot, x = lag(x.boot)))
con_pred = rep(NA_real_, nrow(data.dt))
for (t in (k+2):nrow(data.dt)) {
x.IS <- data.dt$x[2:(t-1)]
y.IS <- data.dt$y.boot[2:(t-1)]
reg.IS <- lm(y.IS ~ x.IS)
x.new <- data.dt$x[t]
y.pred <- predict(reg.IS, newdata = data.frame(x.IS = x.new))
con_pred[t] <- y.pred
}
data.dt$conditional <- con_pred
data.dt <- data.dt %>%
mutate(unconditional = lag(cummean(y.boot))) %>%
na.omit()
error_N.boot = data.dt$y.boot - data.dt$unconditional
error_A.boot = data.dt$y.boot - data.dt$conditional
MSE_N.boot = mean(error_N.boot ^ 2)
MSE_A.boot = mean(error_A.boot ^ 2)
OOS_r.squared.boot <- 1 - MSE_A.boot / MSE_N.boot # out-of-sample R square
### MSE-F statistic
MSE_F.boot <- length(error_N.boot) * (MSE_N.boot - MSE_A.boot) / (MSE_A.boot)
boot.critical[i] <- MSE_F.boot
if (i %% (boot.times/10) == 0) {
timestamp()
print(paste(predictor, ": ", i %/% (boot.times/100), "%", sep = ""))
}
}
## store the results
result <- cbind.data.frame(IS_r.squared,
OOS_r.squared,
MAE_A, # MAE of conditional model
MSE_F,
t(quantile(boot.critical, probs = c(0.90, 0.95, 0.99))),
p.value = mean(boot.critical > MSE_F))
# d_RMSE = round(d_RMSE, digits = 4),
# DM_stat = round(DM_test.result$statistic, digits = 4),
# DM_pval = round(DM_test.result$p.value, digits = 4))
} else {
result <- cbind.data.frame(IS_r.squared,
OOS_r.squared,
MAE_A, # MAE of conditional model
MSE_F)
}
rownames(result) <- predictor
result
}
## store the results for all predictors
table2.uni_predictors <- rbind.data.frame(table2.uni_predictors, result)
}
table2.uni_predictors
## 6. statistics summary ----
table2 <- rbind.data.frame(table2.uni_predictors, "SOP" = sop.result$result)
if ("p.value" %in% colnames(table2)) {
table2$star <- ifelse(table2$p.value <= 0.01, "***", ifelse(table2$p.value <= 0.05, "**", ifelse(table2$p.value <= 0.1, "*", "")))
}
# statistical significance from McCracken (2007)
table2$McCracken <- ifelse(table2$MSE_F >= 3.838, "***", ifelse(table2$MSE_F >= 1.599, "**", ifelse(table2$MSE_F >= 0.685, "*", "")))
table2
TABLE2[[id.names[c]]] <- table2
table2$rowname <- rownames(table2)
table2$portname <- id.names[c]
table2.df <- rbind.data.frame(table2.df, table2)
## 7. cumulative OOS R2 difference merged dataset ----
colnames(Cuml_all.ts) <- c("SOP", ratio_names)
par(mfrow = c(3,3))
for (method in colnames(Cuml_all.ts)) {
par(mar = c(2, 2.5, 2, 0.5))
plot.ts(window(Cuml_all.ts[, method], start = 1990), xlab = NULL, lty = 2, ylab = NULL, main = method, cex.main = 0.8, xaxt = "n", las = 2)
axis(1, seq(1990, 2020, by = 10), cex.axis = 0.8)
}
colnames(Csse_all.ts) <- c("SOP", ratio_names)
par(mfrow = c(3,3))
for (method in colnames(Csse_all.ts)) {
par(mar = c(2, 2.5, 2, 0.5))
plot.ts(Csse_all.ts[, method], xlab = NULL, lty = 2, ylab = NULL, main = method, cex.main = 0.8, xaxt = "n", las = 2)
axis(1, seq(1990, 2020, by = 10), cex.axis = 0.8)
}
par(mfrow = c(1,1), mar = c(5, 4, 4, 2) + 0.1)
}
## [1] "market_Allfirms.csv"
## [1] "Market"
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(actual)` instead of `actual` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(cond)` instead of `cond` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(uncond)` instead of `uncond` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(x)` instead of `x` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## [1] "OOS R Squared: -0.0379"
## [1] "MSE-F: -14.852"
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(predictor)` instead of `predictor` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## [1] "IS R Squared: 0.0083"
## [1] "OOS R Squared: -0.0135"
## [1] "MSE-F: -5.3965"
## [1] "IS R Squared: 5e-04"
## [1] "OOS R Squared: -0.0179"
## [1] "MSE-F: -7.1247"
## [1] "IS R Squared: 4e-04"
## [1] "OOS R Squared: -0.0203"
## [1] "MSE-F: -8.067"
## [1] "IS R Squared: 0.0097"
## [1] "OOS R Squared: -0.0138"
## [1] "MSE-F: -5.5158"
## [1] "IS R Squared: 0.0028"
## [1] "OOS R Squared: -0.0055"
## [1] "MSE-F: -2.2225"
## [1] "port_BH.csv"
## [1] "BH"
## [1] "OOS R Squared: -0.1251"
## [1] "MSE-F: -45.25"
## [1] "IS R Squared: 0.0114"
## [1] "OOS R Squared: -0.0021"
## [1] "MSE-F: -0.8315"
## [1] "IS R Squared: 0.0016"
## [1] "OOS R Squared: -0.0089"
## [1] "MSE-F: -3.5668"
## [1] "IS R Squared: 0.0017"
## [1] "OOS R Squared: -0.0088"
## [1] "MSE-F: -3.547"
## [1] "IS R Squared: 0.0129"
## [1] "OOS R Squared: 3e-04"
## [1] "MSE-F: 0.1056"
## [1] "IS R Squared: 6e-04"
## [1] "OOS R Squared: -0.0117"
## [1] "MSE-F: -4.6891"
## [1] "port_BL.csv"
## [1] "BL"
## [1] "OOS R Squared: -1e-04"
## [1] "MSE-F: -0.0586"
## [1] "IS R Squared: 0.0066"
## [1] "OOS R Squared: -0.0016"
## [1] "MSE-F: -0.6326"
## [1] "IS R Squared: 0.0074"
## [1] "OOS R Squared: 0.0015"
## [1] "MSE-F: 0.5969"
## [1] "IS R Squared: 0.0086"
## [1] "OOS R Squared: 0.0024"
## [1] "MSE-F: 0.97"
## [1] "IS R Squared: 0.0077"
## [1] "OOS R Squared: -0.0011"
## [1] "MSE-F: -0.4457"
## [1] "IS R Squared: 1e-04"
## [1] "OOS R Squared: -0.0045"
## [1] "MSE-F: -1.8376"
## [1] "port_BM.csv"
## [1] "BM"
## [1] "OOS R Squared: -0.0328"
## [1] "MSE-F: -12.9152"
## [1] "IS R Squared: 0.0025"
## [1] "OOS R Squared: -0.0177"
## [1] "MSE-F: -7.0796"
## [1] "IS R Squared: 0"
## [1] "OOS R Squared: -0.0513"
## [1] "MSE-F: -19.8056"
## [1] "IS R Squared: 0"
## [1] "OOS R Squared: -0.0513"
## [1] "MSE-F: -19.8285"
## [1] "IS R Squared: 0.0028"
## [1] "OOS R Squared: -0.0167"
## [1] "MSE-F: -6.6568"
## [1] "IS R Squared: 1e-04"
## [1] "OOS R Squared: -0.0484"
## [1] "MSE-F: -18.7299"
## [1] "port_SH.csv"
## [1] "SH"
## [1] "OOS R Squared: -0.5211"
## [1] "MSE-F: -139.4333"
## [1] "IS R Squared: 0.0015"
## [1] "OOS R Squared: -0.0182"
## [1] "MSE-F: -7.2402"
## [1] "IS R Squared: 3e-04"
## [1] "OOS R Squared: -0.0055"
## [1] "MSE-F: -2.2295"
## [1] "IS R Squared: 3e-04"
## [1] "OOS R Squared: -0.0054"
## [1] "MSE-F: -2.1623"
## [1] "IS R Squared: 0.0037"
## [1] "OOS R Squared: -0.023"
## [1] "MSE-F: -9.1349"
## [1] "IS R Squared: 2e-04"
## [1] "OOS R Squared: -0.0064"
## [1] "MSE-F: -2.5807"
## [1] "port_SL.csv"
## [1] "SL"
## [1] "OOS R Squared: -0.1544"
## [1] "MSE-F: -54.4467"
## [1] "IS R Squared: 0.0083"
## [1] "OOS R Squared: -0.0121"
## [1] "MSE-F: -4.837"
## [1] "IS R Squared: 0"
## [1] "OOS R Squared: -0.0127"
## [1] "MSE-F: -5.0936"
## [1] "IS R Squared: 0"
## [1] "OOS R Squared: -0.0153"
## [1] "MSE-F: -6.1138"
## [1] "IS R Squared: 0.0111"
## [1] "OOS R Squared: -0.0155"
## [1] "MSE-F: -6.2142"
## [1] "IS R Squared: 2e-04"
## [1] "OOS R Squared: -0.0228"
## [1] "MSE-F: -9.0543"
## [1] "port_SM.csv"
## [1] "SM"
## [1] "OOS R Squared: -0.1648"
## [1] "MSE-F: -57.5885"
## [1] "IS R Squared: 0.0037"
## [1] "OOS R Squared: -0.0126"
## [1] "MSE-F: -5.0385"
## [1] "IS R Squared: 0.003"
## [1] "OOS R Squared: -0.0033"
## [1] "MSE-F: -1.3546"
## [1] "IS R Squared: 0.0028"
## [1] "OOS R Squared: -0.0047"
## [1] "MSE-F: -1.8966"
## [1] "IS R Squared: 0.006"
## [1] "OOS R Squared: -0.0191"
## [1] "MSE-F: -7.5974"
## [1] "IS R Squared: 0.004"
## [1] "OOS R Squared: -9e-04"
## [1] "MSE-F: -0.3848"
This table demonstrates the in-sample and out-of-sample R-squares for the market and six size and book-to-market equity ratio sorted portfolios from predictive regressions and the Sum-of-the-Parts method. IS R-squares are estimated using the whole sample period and the OOS R-squares are calculated compare the forecast error of the model against the historical mean model. The full sample period starts from Feb 1966 to December 2019 and the IS period is set to be 20 years with forecsats beginning in Feb 1986. The MSE-F statistics are calculated to test the hypothesis \(H_0: \text{out-of-sample R-squares} = 0\) vs \(H_1: \text{out-of-sample R-squares} \neq 0\).
Predictors here are all in log terms.
gt(table2.df, rowname_col = "rowname", groupname_col = "portname") %>%
tab_header(title = "Table 2 - Forecasts of portfolio returns",
subtitle = paste(freq_name(freq = freq), " data starts from ", first(data_decompose$month), " and ends in ", last(data_decompose$month), ".", sep = "")) %>%
fmt_number(columns = 1:4, decimals = 6, suffixing = TRUE)
| Table 2 - Forecasts of portfolio returns | |||||
|---|---|---|---|---|---|
| monthly data starts from Feb 1966 and ends in Dec 2019. | |||||
| IS_r.squared | OOS_r.squared | MAE_A | MSE_F | McCracken | |
| Market | |||||
| DP | 0.008341 | −0.013471 | 0.033688 | −5.396537 | |
| PE | 0.000497 | −0.017862 | 0.033200 | −7.124701 | |
| EY | 0.000404 | −0.020272 | 0.033270 | −8.067006 | |
| DY | 0.009736 | −0.013773 | 0.033781 | −5.515775 | |
| Payout | 0.002776 | −0.005504 | 0.032549 | −2.222544 | |
| SOP | NA | −0.037873 | 0.033120 | −14.851966 | |
| BH | |||||
| DP | 0.011372 | −0.002052 | 0.035642 | −0.831458 | |
| PE | 0.001588 | −0.008863 | 0.035109 | −3.566768 | |
| EY | 0.001669 | −0.008814 | 0.035111 | −3.547031 | |
| DY | 0.012858 | 0.000260 | 0.035650 | 0.105551 | |
| Payout | 0.000623 | −0.011684 | 0.034990 | −4.689096 | |
| SOP | NA | −0.125086 | 0.037888 | −45.250032 | |
| BL | |||||
| DP | 0.006620 | −0.001561 | 0.034211 | −0.632646 | |
| PE | 0.007436 | 0.001468 | 0.034144 | 0.596908 | |
| EY | 0.008587 | 0.002383 | 0.034126 | 0.969995 | * |
| DY | 0.007719 | −0.001099 | 0.034210 | −0.445663 | |
| Payout | 0.000060 | −0.004547 | 0.034026 | −1.837591 | |
| SOP | NA | −0.000144 | 0.033867 | −0.058633 | |
| BM | |||||
| DP | 0.002479 | −0.017747 | 0.032528 | −7.079636 | |
| PE | 0.000017 | −0.051284 | 0.031991 | −19.805606 | |
| EY | 0.000026 | −0.051346 | 0.031996 | −19.828484 | |
| DY | 0.002845 | −0.016669 | 0.032531 | −6.656812 | |
| Payout | 0.000133 | −0.048364 | 0.031498 | −18.729904 | |
| SOP | NA | −0.032773 | 0.031677 | −12.915206 | |
| SH | |||||
| DP | 0.001519 | −0.018157 | 0.039478 | −7.240162 | |
| PE | 0.000271 | −0.005522 | 0.039231 | −2.229540 | |
| EY | 0.000312 | −0.005354 | 0.039237 | −2.162327 | |
| DY | 0.003696 | −0.023018 | 0.039673 | −9.134863 | |
| Payout | 0.000199 | −0.006397 | 0.039227 | −2.580653 | |
| SOP | NA | −0.521116 | 0.053183 | −139.433314 | |
| SL | |||||
| DP | 0.008256 | −0.012057 | 0.049408 | −4.837020 | |
| PE | 0.000017 | −0.012705 | 0.049646 | −5.093602 | |
| EY | 0.000006 | −0.015289 | 0.049829 | −6.113785 | |
| DY | 0.011124 | −0.015544 | 0.049476 | −6.214239 | |
| Payout | 0.000213 | −0.022810 | 0.049548 | −9.054271 | |
| SOP | NA | −0.154435 | 0.053362 | −54.446666 | |
| SM | |||||
| DP | 0.003725 | −0.012566 | 0.038938 | −5.038514 | |
| PE | 0.003001 | −0.003348 | 0.038679 | −1.354584 | |
| EY | 0.002768 | −0.004693 | 0.038714 | −1.896646 | |
| DY | 0.006001 | −0.019070 | 0.039103 | −7.597391 | |
| Payout | 0.004041 | −0.000949 | 0.038618 | −0.384807 | |
| SOP | NA | −0.164816 | 0.042175 | −57.588485 | |